###Prerequisite
###Install the package if you haven't
setwd("~/git/")
require('devtools')
build('jkmeans')
install.packages("jkmeans_1.0.tar.gz", repos = NULL, type = "source")
Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2_i I)\). With \(k = 10\) and \(n = 1000\), \(N = nk\).
n <- 1e3
k <- 10
N <- n*k
set.seed(49)
(mux <- sort(sample(-20:20, 10)))# [1] -19 -17 -11 -10 -6 -3 -1 3 6 18
(muy <- sample(-20:20, 10))# [1] -10 -3 -16 5 16 -20 2 -13 -5 -6
(mu0 <- cbind(mux, muy))# mux muy
# [1,] -19 -10
# [2,] -17 -3
# [3,] -11 -16
# [4,] -10 5
# [5,] -6 16
# [6,] -3 -20
# [7,] -1 2
# [8,] 3 -13
# [9,] 6 -5
# [10,] 18 -6
(sigma1 <- sample(c(1,1,1,2,3,4,5), 10, replace=TRUE))# [1] 1 4 2 4 1 5 5 1 5 1
sigma0 <- lapply(sigma1, function(x) matrix(x*c(1,0,0,1), 2, 2))
Z <- rep(1:k, each = n)Here are the means
plot(mu0, col = 1:10, pch = 19, cex = 3)We now generate the data with
X0 <- data.frame(t(sapply(Z, function(x) rmvnorm(1, mu0[x,], sigma0[[x]]))), Z)
names(X0) <- c('x1', 'x2', 'Z')
head(X0[sample(dim(X0)[1]),])# x1 x2 Z
# 5564 -1.391007 -20.7115938 6
# 3350 -11.039710 4.7961061 4
# 1000 -19.664400 -9.6635771 1
# 8542 5.388799 -0.2730803 9
# 1467 -13.249741 1.0983979 2
# 3507 -11.265466 2.7168030 4
dat <- as.matrix(X0[, 1:2])And here are the data colored with the truth.
ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y=x2, color = as.factor(Z))) +
scale_color_manual(values = ggCol) +
geom_point(alpha = 0.5)Run with \(jk\)-means with \(j=2, k=10\):
options: max iteration=100 convergence error tolerance = 1E-8
K <- k
j <- 5
set.seed(5431)
set.seed(2^10)
system.time(
jk <- jkmeans::jkmeansEM(dat, K, j , 100, tol = 1E-8)
)# user system elapsed
# 0.693 0.003 0.690
jkv <- apply(jk$zeta, 1, function(x) which( x == max(x) ))system.time({
kv <- kmeans(dat, centers = mu0)
}
)# user system elapsed
# 0.004 0.001 0.005
system.time(
mcv <- Mclust(dat,
G = K,
modelNames="EII")
)# user system elapsed
# 169.990 0.826 170.837
mcv# 'Mclust' model object:
# best model: spherical, equal volume (EII) with 10 components
# Names ari
# 1 Truth v km 0.9579922
# 2 Truth v jk 0.8206916
# 3 Truth v mclust 0.9563053
Y <- data.frame(X0, jk=jkv, kv = kv$cluster,
mc=mcv$classification, kvpp=kvpp$cluster)
p1 <- ggplot(data = Y, aes(x=x1, y=x2)) +
scale_color_manual(values = ggCol)